##################################################

##################################################

########
## SETUP
########
knitr::opts_chunk$set(echo = TRUE)
rm(list=ls()) 
set.seed(847509348)

packageurl <- "http://cran.r-project.org/src/contrib/Archive/grf/grf_0.10.3.tar.gz"
install.packages(packageurl, repos=NULL, type="source")

library(grf)
## packages - check if installed and load
packages <- c("haven", "dplyr", "caret", "fastDummies")

package.check <- lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x, dependencies = TRUE)
    library(x, character.only = TRUE)
  }
})

username <- Sys.getenv("USERNAME")
dta_wd <- paste0("C:/Users/", username, "/Dropbox/CFPB SLCCU Credit Matters Loan Evaluation/CBL_replication_package/03_dta/04_analysis_dtas")
results_wd <- paste0("C:/Users/", username, "/Dropbox/CFPB SLCCU Credit Matters Loan Evaluation/CBL_replication_package/04_tables/02_individual_tables")
dta <- read_dta(paste0(dta_wd,"/cf ficoscore08numlns.dta"))

#To Run in the server
# dta_wd <- "/home/ekp4095/CBL/data"
# results_wd <- "/home/ekp4095/CBL/results"
# dta <- read_dta(paste0(dta_wd,"/cf ficoscore08numlns.dta"))


## Directories - CHANGE!
for (e in 1:3) {

  ###########
  ## Cleaning
  ###########
  
  ## Converting categoricals to dummies?
  dta[,sapply(dta, class) == "haven_labelled"] = 
    lapply(dta[,sapply(dta, class) == "haven_labelled"], as.numeric)
  
  cov <- c(           'svymiss',
                      'creditmiss',
                      'age',
                      'female',
                      'race_black',
                      'married',
                      'adults',
                      'child',
                      'college',
                      'inclt30',
                      'z_insecurity_i',
                      'z_selfcont_i',
                      'z_risk_i',
                      'z_credstatus_i',
                      'z_credknow_i',
                      'bfico',
                      'tradelines1',
                      'savingsbal_mehb95',
                      'savcheck_mehb95',
                      'slccumemb',
                      'binnoncblslcb',
                      'z_default_index_all_ib',
                      'z_amountsowed_ib',
                      'z_newcredit_ib',
                      'z_liquid_ib')
  
  idvar <-            'surveyid'
  treatment_var <-    'enc'
  outcome_var   <-    'ficoscore08'
  
  # remove missing outcomes
  clean_dta <- dta[!is.na(dta$ficoscore08),]

    period <- paste("endline", e, sep="")
    assign("clean_dta", clean_dta %>% filter(!!as.name(period)==1))

    ####################
    # Variable selection
    ####################
    
    ## Outcomes of interest
    Y <- as.numeric(clean_dta$ficoscore08)
    
    ## Treatment variable
    W <- as.integer(clean_dta$enc) 
    
    ## Vector of covariates 
    X <- clean_dta[,c(cov)]
    
    ## Cluster variable
    cluster <- as.integer(clean_dta$surveyid)
    
    #################
    ## Causal forests
    #################
    
    ## We will test 3 models, increasing the number of trees in each one
    
    ## Defining different seeds for tests
    seeds <- c(746) ## 3 seeds at least
    
    ## defining different number of trees - CHANGE!
    num_trees <- c(50000) ## For running in the server
    
    
    ## Looping through different number of trees
    for(t in 1:length(num_trees)){
      
      ## Looping through different seeds
      for(j in 1:length(seeds)){
        
        ## First estimation of Y and W to use in models
        Y_forest = regression_forest(X, Y, seed = seeds[j], clusters = cluster)
        Y_hat = predict(Y_forest)$predictions
        W_forest = regression_forest(X, W, seed = seeds[j], clusters = cluster)
        W_hat = predict(W_forest)$predictions
        
        
        ## 1) Selecting variables with high importance
        ## Running forest
        cf <- causal_forest(X, Y, W,
                            Y.hat = Y_hat, W.hat = W_hat,
                            clusters = cluster,
                            equalize.cluster.weights = FALSE, 
                            seed = seeds[j])
        
        ## Select variables with percentage importance above mean and save in new vector
        x_cf1 <- variable_importance(cf)
        X_selected <- which(variable_importance(cf)>mean(variable_importance(cf)))
        
        ## 2) Model with variables selected in 1
        ## Running forest and save in temp object
        set.seed(seeds[j])
        cf <- causal_forest(X[, X_selected], Y, W,
                            Y.hat = Y_hat, W.hat = W_hat,
                            num.trees = num_trees[t],
                            seed = seeds[j],
                            honesty = TRUE,
                            stabilize.splits = TRUE,
                            clusters = cluster,
                            equalize.cluster.weights = FALSE, 
                            tune.parameters = "none")
        
        ## Save forest in object with new name
        assign(paste("cf", paste("t", t, sep=""), paste("s", j, sep=""), sep ="_"), cf)
        
        ## Predicted HTE (using out of bag predictions)
        pred_temp <- predict(cf, estimate.variance = TRUE)
        pred_temp_q <- ntile(pred_temp$predictions, 10)
        pred_temp_q3 <- ntile(pred_temp$predictions, 3)
        
        ## ATE by terciles
        ate_t_temp <- lapply(
          seq(3), function(w) {
            ate <- average_treatment_effect(cf, subset = pred_temp_q3 == w)
          })

        ## Create vectors for var importance with correct dimensions for merging
        varimp <-variable_importance(cf) #weighted sum of how many times feature i was split on at each depth in the forest
        len <- length(names(X)) - length(variable_importance(cf))
        varimp <- append(varimp, rep(NA, len), after = len) #add NAs for all of the other X variables that are not in the importance vector
        names_varimp <- names(X[X_selected]) #pull the names of the important variables 
        names_varimp <- append(names_varimp, rep(NA, len), after = len) #again add NAs for the names of the rest; 
        

        if (j==1 & t==1){
          ## Predicted HTE (tau hat)
          HTE_pred_df <- clean_dta
          HTE_pred_df[,"y_hat"] <- Y_hat
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep ="_")]] <- pred_temp$predictions
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep ="_")]] <- pred_temp$variance.estimates
          
          
          ## Percetage of importance of each x in models
          var_imp_df <- data.frame(varimp, names_varimp) ## Empty data frame first time
          colnames(var_imp_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "perc", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "var", sep="_"))
          
          
          ## ATE
          ATE_t_df <-data.frame(do.call(rbind, ate_t_temp))
          colnames(ATE_t_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "atet", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "set", sep="_"))
         
          
           ## Test for heterogeity
          #test_cal_df <- data.frame(test_calibration(cf)[2], test_calibration(cf)[8], test_calibration(cf)[1], test_calibration(cf)[7]) ## Empty data frame first time
          #colnames(test_cal_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "test", sep="_"), paste(paste("t", t, sep=""), paste("s", j, sep=""), "pval", sep="_"))
          test_cal_df <- data.frame(test_calibration(cf)[2], test_calibration(cf)[4], test_calibration(cf)[6], test_calibration(cf)[8], 
                                    test_calibration(cf)[1], test_calibration(cf)[3], test_calibration(cf)[5], test_calibration(cf)[7], nrow(cf$predictions)) ## Empty data frame first time
          colnames(test_cal_df) <- c(paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep="_"), 
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "tval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "pval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpred", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestse", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpval", sep="_"),
                                     paste(paste("t", t, sep=""), paste("s", j, sep=""), "N", sep="_"))     
        }
        else {
          ## Predicted HTE (tau hat)
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep ="_")]] <- pred_temp$predictions
          HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep ="_")]] <- pred_temp$variance.estimates
          ## Percetage of importance of each x in models
          var_imp_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "perc", sep ="_")]] <- varimp
          var_imp_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "var", sep ="_")]] <- names_varimp
          
          ## ATE by quintiles
          ATE_t_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "atet", sep ="_")]] <- do.call(rbind, ate_t_temp)[, 1]
          ATE_t_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "set", sep ="_")]] <- do.call(rbind, ate_t_temp)[, 2]
          
          
          ## Test for heterogeity
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pred", sep ="_")]] <- test_calibration(cf)[2]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "se", sep ="_")]] <- test_calibration(cf)[4]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "tval", sep ="_")]] <- test_calibration(cf)[6]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "pval", sep ="_")]] <- test_calibration(cf)[8]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpred", sep ="_")]] <- test_calibration(cf)[1]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestse", sep ="_")]] <- test_calibration(cf)[3]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforesttval", sep ="_")]] <- test_calibration(cf)[5]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "meanforestpval", sep ="_")]] <- test_calibration(cf)[7]
          test_cal_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "N", sep ="_")]] <- nrow(cf$predictions)
          
        }
        ## Deciles for predicted HTE
        HTE_pred_df[[paste(paste("t", t, sep=""), paste("s", j, sep=""), "q3", sep ="_")]] <- pred_temp_q3
      }
    }
    
    ## Save data frames into csv
    write.csv(HTE_pred_df, paste(results_wd, paste("/HTE_ficoscore08_numloans_e", e, "_df", sep=""), ".csv", sep = ""), row.names = FALSE)
    write.csv(var_imp_df, paste(results_wd, paste("/varimp_ficoscore08_numloans_e",e , "_df", sep=""), ".csv", sep = ""), row.names = FALSE)
    write.csv(test_cal_df, paste(results_wd, paste("/testcal_ficoscore08_numloans_e", e, "_df", sep=""), ".csv", sep = ""), row.names = FALSE)
    write.csv(ATE_t_df, paste(results_wd, paste("/ATEt_ficoscore08_numloans_e", e,  "_df", sep="_"), ".csv", sep = ""), row.names = FALSE)
    
    ## Cleaning workspace 
    #rm(cf, pred_temp_q, ate_t_temp)
    
    ###################
    ## Saving workspace
    ###################
    
    #save.image(paste(results_wd,"/causalforestficoscore08numloans_6mos.RData", sep = "")) 
    
}